{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyarrow import parquet\n",
    "import pyarrow\n",
    "import datasets\n",
    "\n",
    "import transformers\n",
    "import numpy as np\n",
    "\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "\n",
    "DATASET = \"../../dataset_processed_orca/openchat3.1_orca_train\"\n",
    "TOKENIZER_NAME = \"/data/one/LLaMA_13B_with_EOT_token\"\n",
    "\n",
    "# Load tokenizer\n",
    "tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset parquet/openchat3.1_orca_train to /home/one/.cache/huggingface/datasets/parquet/openchat3.1_orca_train-8a8b488621d0ce71/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "571c0c2a73e2431cb51ae1e3d57c0f54",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "151d30bc55424d51825a774218a476fe",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "807a1d4b613a478f90b6fd33eec0791e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset parquet downloaded and prepared to /home/one/.cache/huggingface/datasets/parquet/openchat3.1_orca_train-8a8b488621d0ce71/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.\n"
     ]
    }
   ],
   "source": [
    "# Load dataset\n",
    "dataset = datasets.load_dataset(DATASET, split=\"train\", keep_in_memory=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_parquet = parquet.read_table(DATASET + \"/data.parquet\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3225785"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(ds_parquet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "00d988d340e2402a99af707c338fa24b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/4058285 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "supervise_len = np.array([pyarrow.compute.sum(x.values).as_py() for x in tqdm(ds_parquet.column(\"masks\"))])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Length statistics\n",
    "lengths = np.array(ds_parquet.column(\"length\").to_pylist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "459"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.sum((lengths == 2048) & (supervise_len < 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'<s> User: Please answer the following question: Information:  - Shawntae Harris (born April 14, 1974), better known by her stage name Da Brat, is an American rapper and actress from Joliet, Illinois. Beginning her career in 1991, Harris debut album \"Funkdafied\" (1994) sold one million copies, making her the first female solo rap act to receive a platinum certification, and the second overall female rap act (solo or group) after Salt N Pepa. She has received two Grammy Award nominations.  - Tamia Marilyn Hill (born May 9, 1975), known professionally as Tamia, is a Canadian singer-songwriter. She is best known for her first Top 40 hit on the R&B charts \"You Put a Move on My Heart\", her 2001 hit \"Stranger in My House\", Fabolous\\' 2003 hit \"Into You\" (which samples her 1998 song \"So into You\"), her 2006 hit \"Me\", and her 2012 hit \"Beautiful Surprise\". With a career spanning over two decades, she has garnered six career Grammy nominations for her musical work.  - Christopher Brian \"Chris\" Bridges (born September 11, 1977), better known by his stage name Ludacris, is an American hip hop recording artist and actor from Atlanta, Georgia. Alongside his manager, Chaka Zulu, Ludacris is the co-founder of Disturbing tha Peace, an imprint distributed by Def Jam Recordings. Throughout his career, Ludacris has won Screen Actors Guild, Critic\\'s Choice, MTV, and Grammy Awards. Along with fellow Atlanta-based rappers Big Boi and André 3000 of OutKast, Ludacris was one of the first and most influential \"Dirty South\" rappers to achieve mainstream success during the early 2000s.  - Alicia Augello Cook (born January 25, 1981), known by her stage name Alicia Keys, is an American singer, songwriter, pianist and actress. Keys released her debut album with J Records, having had previous record deals first with Columbia and then Arista Records. Keys\\' debut album, \"Songs in A Minor\" was released in 2001, producing her first \"Billboard\" Hot 100 number-one single \"Fallin\\'\", and selling over 12 million copies worldwide. The album earned Keys five Grammy Awards in 2002. Her sophomore album, \"The Diary of Alicia Keys\", was released in 2003, spawning successful singles \"You Don\\'t Know My Name\", \"If I Ain\\'t Got You\" and \"Diary\", and selling eight million copies worldwide. The duet song \"My Boo\" with Usher, scored her a second number-one single in 2004. The album garnered her an additional four Grammy Awards in 2005. Later that year, she released her first live album, \"Unplugged\", becoming the first woman to have an \"MTV Unplugged\" album debut at number one.  - Aretha Louise Franklin (born March 25, 1942) is an American singer, songwriter and musician. Franklin began her career singing gospel at her father, minister C. L. Franklin\\'s church as a child. In 1960, at the age of 18, Franklin embarked on a secular career, recording for Columbia Records but only achieving modest success. Following her signing to Atlantic Records in 1967, Franklin achieved commercial acclaim and success with songs such as \"Respect\", \"(You Make Me Feel Like) A Natural Woman\" and \"Think\". These hits and more helped her to gain the title The Queen of Soul by the end of the 1960s decade.  - Jermaine Dupri Mauldin (born September 23, 1972), known as Jermaine Dupri or JD, is an American hip hop recording artist, record producer, songwriter and rapper. He was born in Asheville, NC and was raised in Atlanta, Georgia. He has worked with and produced Mariah Carey, Usher, Jay-Z, Nelly, Monica, Da Brat, Xscape, Janet Jackson, TLC, Aretha Franklin, Ludacris, Alicia Keys, Bow Wow, Miss Mulatto, and most recently The Rap Game season 2 winner, Mani.  - John David Jackson (born November 18, 1977), better known by his stage name Fabolous, is an American hip hop recording artist from Brooklyn, New York City. Jackson\\'s career began when he was a senior in high school and ended up rapping live on American record producer and music executive DJ Clue\\'s radio show, then on Hot 97. Jackson was subsequently signed by DJ Clue to his label Desert Storm, and later secured a distribution deal with Elektra Records. Fabolous\\' first release, \"Ghetto Fabolous\" (2001), spawned the hit singles \"Can\\'t Deny It\" and \"Young\\'n (Holla Back)\", which led Jackson to prominence. His second release was 2003\\'s \"Street Dreams\", which was supported by two Top 10 singles \"Can\\'t Let You Go\" and \"Into You\".  - `` Still \\'\\' is a song by Canadian recording artist Tamia . It was written and produced by Bryan Michael Cox , Jermaine Dupri for her third studio album More and released as is fourth single in 2004 . The maxi CD single includes 10 dance remixes of the title song , as well as the Mike Rizzo remixes of Tamia \\'s singles , `` Tell Me Who \\'\\' and `` Stranger in My House \\'\\' . This song is also re-released on Tamia \\'s Beautiful Surprise album . Tamia re-recorded the song with new arrangement and production by Luke Laird .  - Cornell Iral Haynes, Jr. (born November 2, 1974), known professionally as Nelly, is an American rapper, singer, songwriter, entrepreneur, investor, and occasional actor from St. Louis, Missouri. Nelly embarked on his music career with Midwest hip hop group St. Lunatics, in 1993 and signed to Universal Records in 1999. Under Universal, Nelly began his solo career in the year 2000, with his debut album \"Country Grammar\", of which the title-track was a top ten hit. The album debuted at number three on the \"Billboard\" 200 and went on to peak at number one. \"Country Grammar\" is Nelly\\'s best-selling album to date, selling over 8.4 million copies in the United States. His following album \"Nellyville\", produced the number-one hits \"Hot in Herre\" and \"Dilemma\" (featuring Kelly Rowland). Other singles included \"Work It\" (featuring Justin Timberlake), \"Air Force Ones\" (featuring Murphy Lee and St. Lunatics), \"Pimp Juice\" and \"#1\".  - Mariah Carey (born March 27, 1969 or 1970) is an American singer, songwriter, record producer, and actress. In 1990, she rose to fame with the release of \"Vision of Love\" from her eponymous debut album. The album produced four chart-topping singles in the US and began what would become a string of commercially successful albums which solidified the singer as Columbias highest selling act. Carey and Boyz II Men spent a record sixteen weeks atop the \"Billboard\" Hot 100 in 19951996 with \"One Sweet Day\", which remains the longest-running number-one song in US chart history. Following a contentious divorce from Sony Music head Tommy Mottola, Carey adopted a new image and traversed towards hip hop with the release of \"Butterfly\" (1997). In 1998, she was honored as the world\\'s best-selling recording artist of the 1990s at the World Music Awards and subsequently named the best-selling female artist of the millennium in 2000.    \\'still \\' is related to which object entity through the relation of \\'record label\\'?  Choices: - 1995  - 1996  - album  - atlantic records  - columbia records  - def jam recordings  - disturbing tha peace  - elektra records  - j records  - my boo  - record  - universal records\\nAnswer:<|end_of_turn|> Assistant GPT3: J Records'"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode(ds_parquet.column(\"tokens\")[1439305].as_py())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.31643916311707815"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.sum(supervise_len) / np.sum(lengths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "26 418.277155026512 2048\n",
      "[4054500 2155684 1536589 3592824 1231999  328651 2835248 3489625 3046371\n",
      " 1965088  886623 2924761  886617 2835270 3726799 3827710 1739044 2407680\n",
      " 1016836 4003094 1536786  275018 3947613  667172   36209 3827622  886499\n",
      " 2407810  886491 3592698 3046158 2835410 2155465 2835411 3046151  206122\n",
      " 1232122 1232117   91533 1337040 2602593 1016842  558564 2835342 1536697\n",
      " 3046272 1337078 1536705 2835360 1232110 1232088  392724 1337066 3254874\n",
      " 1739016 3046253 2407714 3318635 4054410 4054428 2155704 4054507 3046411\n",
      " 2602870 4054622 3947429 3254645  886730 2835036 2407403 2602849 1964919\n",
      " 3254656 1739358 1536375 3046646 2155892 2407414 1739350 3784567 2155870\n",
      " 4054604  206229  392622 3592948 2407296 2155980 3947398 3254598 1337300\n",
      " 2407314 3489507 1964901  392619 1964905  450636 2155950  274912 3254624\n",
      "  786675 3827792 3592950  450626  621214  558310 1536392   36099 3726720\n",
      " 1536489 2602792 3489574 2835170 1965033 1337178 2602764 4054551 2407525\n",
      " 2155721 3046452  786759 1536542 2835191 4054518 1231989 2155712  153924\n",
      " 3489538 3254726 3046548 3489544 3366312 1739320 3775300 2602830 2835094\n",
      " 3254705 3046598 3726718 3993243 1337214  886692  786697  667301 2407468\n",
      " 1739282  558419 4054557 2407454 1964456 1231589 2406724 1963752 1231267\n",
      " 1115896 2157133  206579 3593620 1634305  206575 2406127  557692 3593612\n",
      " 3828342 1535003 2406146 3318612 3488957 1963771 1963750 2833860   91910\n",
      "  206587 3366976 2519483 1338002 1338001  451000 3253886  450997  786148\n",
      " 1740564 1740597 1740593 2833857 4055333  206594 1740587  887429 2406054\n",
      "  557661 4055340 3047945 3593600  206565  887325 1963850 2603870 2406227\n",
      " 3047836 1740495  786234 3254015  450947 1740490 2603822 3047796  887300\n",
      " 3593550 1115843 3366903 3254034 1231328 1337892  153608 1740524  667968\n",
      " 1740563 3946807 3593591 2406181 2157072 3726062  786194 3828316 2603872\n",
      " 2603880 1337931 2406205 1115862 3533524   35768  557745 3366919 3946827\n",
      "  450954 2157217  451006 1740632   91957 3593778 2064076 2405823 2405826\n",
      " 3048240  621185  887538 3828451  329105 3725905 3367033 2405863 3253799\n",
      " 1231171  274513 2604077 1016030 1740802  329090  557518 1634331  887559\n",
      " 2943708 1740856 3048293 1534668   91975 1534671 3367074 2157392  392240\n",
      " 1534681 2157405 3593784 3775355 1231134 1963588  451077  329109 3048285\n",
      " 3254036 2833757 3828417 4003019  887457 1740685 3048144 2604023 2157260\n",
      " 2833800  668042 1016041 2406026  786127 2157234 1231237 2157233 1115913\n",
      " 3593669 2406029 1740633 1963709 1338063  451034 2405992 2157286 3048192\n",
      " 2604047 2833767 3253815 1740720 3253818 3828400 1016040 3048178 3048176\n",
      " 3367006 1534829 2833794  451036 1534846 2604026 2405991  451041  557806\n",
      " 2156986 1231336 2834265 1016341 2834271 2156626 1115620 1231508  153752\n",
      " 2406610 1964310 4003039 1740109 3366721 1016352 2064114 1115610  328891\n",
      " 2603440 1964380  557946 2834321   35875 3489214 1740192  450887 1231475\n",
      " 3593373 3047563 1337770 2156677 1016319  667785   91805 3593367 3828118\n",
      " 2406576 1964277 2156655   67684 3308700 3828092 3047541 2156691  557967\n",
      " 1740056 1016385 2156533 3047356 1652365 2156531 1535582 3047318 2519505\n",
      " 3047358 2519507  786450  558048 3947113 1739999 1964417 3593300 3047283\n",
      " 2156498 3366660 1535499 2834361 2834355 1964395 2156577 3254224   35889\n",
      " 2603389  450849 1231552 2406659 3254259  450846 1535525 3254243  887089\n",
      "  206434 1964397 1231558 3047374 3993273  557990 2831666 4055087 3254146\n",
      " 3533523 3489073 2834053 2406320 1740382 2603732 1535202 3946905 2156934\n",
      " 2156904  153675 1337843 1337841 3489097 1535238  328979 3047680 1964088\n",
      " 1535221  887211 3047729 1963979 1115822 1016186 1963918  557815 2406291\n",
      " 1115796  206532 1963948 1535193 1337871 3828245 4055202 1016203 3366887\n",
      " 3047745 2833992  153672  611423  557816 3593376  667873 3047670 3047604\n",
      " 2406551 1964207 2834180 1535347 2156711 2406562 3366791 2603604 3828139\n",
      " 1740216 1964219 3254142  450892 1115669 3047585 3489170 1535398 2064092\n",
      " 1535288 2834160 3254122 1231404 3946934 3946938 2603662 1964138 1964139\n",
      " 1740304 1016287 1016297  557868 3254113 3047630 2603643  153711 3047617\n",
      "   91831 2834157 2156744  667852 1280988]\n"
     ]
    }
   ],
   "source": [
    "# Length statistics\n",
    "lengths = np.array(dataset[\"length\"])\n",
    "\n",
    "print (np.min(lengths), np.mean(lengths), np.max(lengths))\n",
    "\n",
    "print (np.argsort(lengths)[-500:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<s> Assistant is GPT4<|end_of_turn|> You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.<|end_of_turn|> User: Which is an appropriate title for this article?\n",
      "\n",
      "Deutsche Bank AG and Thomas Weisel Partners LLC will pay a total of \\$100 million to settle allegations they published misleading stock research to win investment-banking business, becoming <|end_of_turn|> Assistant: \"Big Banks Pay Money to Make Problem Go Away\"<|end_of_turn|>\n",
      "<s> Assistant is GPT4<|end_of_turn|>\n",
      "\n",
      " You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.<|end_of_turn|>\n",
      "\n",
      " User: Which is an appropriate title for this article?\n",
      "\n",
      "Deutsche Bank AG and Thomas Weisel Partners LLC will pay a total of \\$100 million to settle allegations they published misleading stock research to win investment-banking business, becoming <|end_of_turn|>\n",
      "\n",
      " Assistant:\n",
      "=================\n",
      "\"Big Banks Pay Money to Make Problem Go Away\"<|end_of_turn|>\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print the supervised and unsupervised text\n",
    "sample_index = 1001\n",
    "\n",
    "item = dataset[sample_index]\n",
    "\n",
    "token_ids = np.array(item[\"1_tokens\"])\n",
    "masks     = np.array(item[\"1_masks\"])\n",
    "\n",
    "print(tokenizer.decode(token_ids))\n",
    "print(tokenizer.decode(token_ids[~masks].tolist(), spaces_between_special_tokens=False).replace(\"<|end_of_turn|>\", \"<|end_of_turn|>\\n\\n\"))\n",
    "print(\"=================\")\n",
    "print(tokenizer.decode(token_ids[masks].tolist(), spaces_between_special_tokens=False).replace(\"<|end_of_turn|>\", \"<|end_of_turn|>\\n\\n\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'<s> You are an AI assistant. You will be given a task. You must generate a detailed and long answer.<|end_of_turn|> User: How is \"\"Thank you, Lady.\" said in Czech?<|end_of_turn|> Assistant GPT3: In Czech, the phrase \"Thank you, Lady\" can be translated as \"Děkuji, paní.\" \\n\\nThe word \"Děkuji\" means \"Thank you\" and is a polite way of expressing gratitude. It is commonly used in everyday conversations and interactions.\\n\\nThe word \"paní\" means \"Lady\" and is used to address a woman in a polite and respectful manner. It is commonly used in formal settings or when addressing someone who is older or in a position of authority.\\n\\nWhen combined, \"Děkuji, paní\" is a polite and respectful way of expressing gratitude to a woman. It can be used in a variety of situations, such as thanking a waitress for her service or expressing appreciation to a female colleague for her help.<|end_of_turn|>'"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode(dataset[3][\"tokens\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
